Isolated cells were sorted using a variety of cell markers
RNA was isolated and sequenced from the various cell types.
sample_table <- read.delim('sample.table', stringsAsFactors = FALSE)
rownames(sample_table) <- sample_table$Nextera_XT_index
sample_table$Nextera_XT_index <- NULL
sample_table$Replicates. <- NULL
sample_table$deseq_groups <- c("1","6","3","2","4","1","3","2","5","1","5","2","5","1","3","6","2","6","3")
datatable(sample_table) %>%
formatStyle("Phenotype", backgroundColor = styleEqual(c("CD33+","CD33+GFP-","CD33+GFP+","CD19+","CD19+GFP+"), c("#EAD3BF","#AA9486", "#B6854D","#9986A5","#79402E")))
# fpkm_data <- purrr::map(paste0("myriad/",rownames(sample_table),"/replicate_1/stringtie/",rownames(sample_table),".gene_abund.tab"), read.table, stringsAsFactors=FALSE, sep="\t", header = TRUE) %>% reduce(left_join, by = "Gene.ID") %>% select("Gene.ID","Gene.Name",contains("FPKM"))
#
# colnames(fpkm_data) <- c("Gene.ID","Gene.Name",rownames(sample_table))
# saveRDS(fpkm_data,"fpkm_data.rds")
fpkm_data <- readRDS("fpkm_data.rds")
## TPM data
# tpm_data <- purrr::map(paste0("myriad/",rownames(sample_table),"/replicate_1/stringtie/",rownames(sample_table),".gene_abund.tab"), read.table, stringsAsFactors=FALSE, sep="\t", header = TRUE) %>% reduce(left_join, by = "Gene.ID") %>% select("Gene.ID","Gene.Name",contains("TPM"))
#
# colnames(tpm_data) <- c("Gene.ID","Gene.Name",rownames(sample_table))
# saveRDS(tpm_data , "tpm_data.rds")
tpm_data <- readRDS("tpm_data.rds")
## Filter genes with a low expression throughout
# - select rows with max value >= 2
fpkm_data <- fpkm_data[apply(fpkm_data[,3:21], 1, max) >= 2,]
fpkms_counts_only <- fpkm_data[,3:21]
## Log transform the data
log.data = log2(fpkms_counts_only+1)
colnames(log.data) <- colnames(fpkm_data[,3:21])
rownames(log.data) <- make.names(fpkm_data$Gene.Name,unique = TRUE)
## Transpose the matrix back for PCA
t_log.data = t(log.data)
fit <- prcomp(t_log.data)
x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == x)]^2/sum(fit$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == y)]^2/sum(fit$sdev^2)))
data <- data.frame(fit$x)
data<-rownames_to_column(data, var = "sample")
data<-left_join(data,rownames_to_column(sample_table, var = "sample_id"), by = c("sample" = "sample_id"))
ggplot(data,aes(PC1,PC2, colour = Phenotype )) + geom_point(size = 3 ) +
geom_label_repel(aes(label = Samples), size = 2,box.padding = 0.1,point.padding = 0.1,
segment.color = 'grey50') + theme_minimal() + xlab(x.lab) + ylab(y.lab) +
ggtitle(label = "FPKM PCA; no quantile normalization")
## Quantile normalization
normalized.data = normalize.quantiles(as.matrix(log.data)) # A quantile normalization.
## Transpose the matrix back for PCA
normalized.data = t(normalized.data)
## re-attach rownames colnames
rownames(normalized.data) <- colnames(log.data)
colnames(normalized.data) <- rownames(log.data)
fit <- prcomp(normalized.data)
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == x)]^2/sum(fit$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == y)]^2/sum(fit$sdev^2)))
data <- data.frame(fit$x)
data<-rownames_to_column(data, var = "sample")
data<-left_join(data,rownames_to_column(sample_table, var = "sample_id"), by = c("sample" = "sample_id"))
ggplot(data,aes(PC1,PC2, colour = Phenotype )) + geom_point(size = 3 ) +
geom_label_repel(aes(label = Samples), size = 2,box.padding = 0.1,point.padding = 0.1,
segment.color = 'grey50') + theme_minimal() + xlab(x.lab) + ylab(y.lab) +
ggtitle(label = "FPKM PCA; with quantile normalization")
## Filter genes with a low expression throughout
# - select rows with max value >= 2
tpm_data <- tpm_data[apply(tpm_data[,3:21], 1, max) >= 2,]
tpm_counts_only <- tpm_data[,3:21]
## Log transform the data
log.data = log2(tpm_counts_only+1)
colnames(log.data) <- colnames(tpm_data[,3:21])
rownames(log.data) <- make.names(tpm_data$Gene.Name,unique = TRUE)
## Transpose the matrix back for PCA
t_log.data = t(log.data)
fit <- prcomp(t_log.data)
x="PC1"
y="PC2"
x.lab = paste(x, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == x)]^2/sum(fit$sdev^2)))
y.lab = paste(y, sprintf('(%0.1f%% explained var.)', 100 * fit$sdev[which(colnames(fit$x) == y)]^2/sum(fit$sdev^2)))
data <- data.frame(fit$x)
data<-rownames_to_column(data, var = "sample")
data<-left_join(data,rownames_to_column(sample_table, var = "sample_id"), by = c("sample" = "sample_id"))
ggplot(data,aes(PC1,PC2, colour = Phenotype )) + geom_point(size = 3 ) +
geom_label_repel(aes(label = Samples), size = 2,box.padding = 0.1,point.padding = 0.1,
segment.color = 'grey50') + theme_minimal() + xlab(x.lab) + ylab(y.lab) +
ggtitle(label = "TPM PCA; no quantile normalization")
##PCA with counts ###Using scater normalization
# big_counts <- paste0("myriad/",rownames(sample_table),"/replicate_1/qc/",rownames(sample_table), "_QC.summary.txt/QC.geneCounts.formatted.for.DESeq.txt.gz") %>% map(gzfile) %>% map(read.table, stringsAsFactors = FALSE, sep = "\t") %>% reduce(left_join, by = "V1")
#
# colnames(big_counts) <- c("ensgene", rownames(sample_table))
# rownames(big_counts) <- big_counts$ensgene
# big_counts$ensgene <- NULL
# big_counts <- big_counts[!is.na(rowSums(big_counts)),]
# saveRDS(big_counts,"big_counts.rds")
big_counts <- readRDS("big_counts.rds")
count_matrix <- as.matrix(big_counts)
colnames(count_matrix) <- rownames(sample_table)
rownames(count_matrix) <- rownames(big_counts)
sce <- SingleCellExperiment(list(counts = count_matrix),
colData = sample_table)
sce <- sce[rowSums(counts(sce)) > 0,]
sce <- calculateQCMetrics(sce)
## prepare total count and total features data
my_df <- data.frame("sample_id" = colnames(sce), "total_counts" = sce$total_counts,
"total_features" = sce$total_features_by_counts )
ggplot(my_df,aes(total_counts)) + geom_histogram(bins = 10) +
theme_minimal() + ggtitle("Histogram of total counts") + ylab("number of samples") + xlab("total counts")
ggplot(my_df,aes(total_features)) + geom_histogram(bins = 10) +
theme_minimal() + ggtitle("Histogram of total features") + ylab("number of samples") + xlab("total features")
###PCA without batch effect removal
sizeFactors(sce) <- librarySizeFactors(sce)
sce <- normalize(sce)
sce <- runPCA(sce)
data <- reducedDim(sce)
percent_var_PC1<- paste("PC1", sprintf('(%0.1f%% explained var.)', attr(data, 'percentVar')[1] * 100 ))
percent_var_PC2<- paste("PC2", sprintf('(%0.1f%% explained var.)', attr(data, 'percentVar')[2] * 100 ))
data <- data.frame(data)
data<-left_join(rownames_to_column(data), rownames_to_column(sample_table))
ggplot(data,aes(PC1,PC2,colour = Phenotype)) + geom_point(size = 3) +
geom_label_repel(label = data$Samples, size = 2,box.padding = 0.1,point.padding = 0.1,
segment.color = 'grey50') + theme_minimal() + xlab(percent_var_PC1) + ylab(percent_var_PC2)
big_counts <- big_counts[-c(58736:58739),]
dds <- DESeqDataSetFromMatrix(big_counts, colData = sample_table, design = ~deseq_groups)
my_vst <- vst(dds)
pcaData <- DESeq2::plotPCA(my_vst, intgroup =c("Phenotype","Samples"),returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData,aes(PC1,PC2,colour = Phenotype)) + geom_point(size = 3 ) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
geom_label_repel(aes(label = pcaData$Samples), size = 2,box.padding = 0.1,point.padding = 0.1,
segment.color = 'grey50') +
theme_minimal()
top_contribs = function(object) {
# calculate the variance for each gene
rv <- rowVars(assay(object))
# select the 1000 top genes by variance
select <- order(rv, decreasing=TRUE)[seq_len(min(1000, length(rv)))]
# perform a PCA on the data in assay(x) for the selected genes
pca <- prcomp(t(assay(object)[select,]))
# the contribution to the total variance for each component
percentVar <- pca$sdev^2 / sum( pca$sdev^2 )
# Top 20 contributers to PC1 PC2
PCA1_contrib <- sort(abs(pca$rotation[,1]), decreasing = TRUE )[1:20]
PCA2_contrib <- sort(abs(pca$rotation[,2]), decreasing = TRUE )[1:20]
PCA_contrib <- c(PCA1_contrib, PCA2_contrib)
PCA_contrib <- data.frame("ensgene" = names(PCA_contrib), PCA_contrib)
PCA_contrib <- cbind(PCA_contrib, data.frame("PC" = c(rep("PC1",20),rep("PC2","20"))))
PCA_contrib <- left_join(PCA_contrib, grch38, by = "ensgene") %>% dplyr::select("PC","symbol", "biotype")
return(PCA_contrib)
}
datatable(top_contribs(my_vst))
dds$deseq_groups <- relevel(dds$deseq_groups,ref = "2")
deseq <- DESeq(dds)
res <- results(deseq,contrast = c("deseq_groups", "1", "2"))
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[complete.cases(resOrdered),]
resOrdered <- data.frame(resOrdered)
resOrdered <- left_join(rownames_to_column(resOrdered,var = "ensgene"), grch38, by = "ensgene") %>% select(symbol,log2FoldChange,padj)
res_for_table <- resOrdered %>% dplyr::filter(padj < 0.1)
datatable(res_for_table, extensions = 'Buttons', options = list(dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel')))
hallmark <- gmtPathways("~/genome_apps/GSEA/h.all.v6.2.symbols.gmt")
plot_gsea <- function(res){
res <- res[complete.cases(res),]
res$fcSign <- sign(res$log2FoldChange)
res$logP <- -log10(res$padj)
res$metric <- res$logP/res$fcSign
y<-res[,c("symbol", "metric")]
geneList <- y$metric
names(geneList) <- y$symbol
geneList <- geneList[order(geneList, decreasing = T)]
fgseaRes <- fgsea(hallmark, geneList, nperm=10000, maxSize = 500)
fgseaRes <- fgseaRes[fgseaRes$padj < 0.05,]
plotGseaTable(hallmark[fgseaRes$pathway], geneList, fgseaRes, gseaParam = 0.4)
}
plot_gsea(resOrdered)
res <- results(deseq,contrast = c("deseq_groups", "3", "2"))
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[complete.cases(resOrdered),]
resOrdered <- data.frame(resOrdered)
resOrdered <- left_join(rownames_to_column(resOrdered,var = "ensgene"), grch38, by = "ensgene") %>% select(symbol,log2FoldChange,padj)
res_for_table <- resOrdered %>% dplyr::filter(padj < 0.1)
datatable(res_for_table, extensions = 'Buttons', options = list(dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel')))
plot_gsea(resOrdered)
dds$deseq_groups <- relevel(dds$deseq_groups,ref="4")
res <- results(deseq,contrast = c("deseq_groups", "5", "4"))
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[complete.cases(resOrdered),]
resOrdered <- data.frame(resOrdered)
resOrdered <- left_join(rownames_to_column(resOrdered,var = "ensgene"), grch38, by = "ensgene") %>% select(symbol,log2FoldChange,padj)
res_for_table <- resOrdered %>% dplyr::filter(padj < 0.1)
datatable(res_for_table, extensions = 'Buttons', options = list(dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel')))
plot_gsea(resOrdered)